{
 "cells": [
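  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9b8e4e7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Requires a transformers release that includes LLaVA (this notebook was run with 4.36.2).\n",
    "# To upgrade inside the kernel's environment: %pip install -U transformers"
   ]
  },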
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "03b3fec5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig\n",
    "from transformers import CLIPVisionModel, CLIPImageProcessor\n",
    "from transformers import MistralConfig\n",
    "import numpy as np\n",
    "import requests\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f94fcca7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CLIPVisionConfig {\n",
       "  \"attention_dropout\": 0.0,\n",
       "  \"dropout\": 0.0,\n",
       "  \"hidden_act\": \"quick_gelu\",\n",
       "  \"hidden_size\": 768,\n",
       "  \"image_size\": 224,\n",
       "  \"initializer_factor\": 1.0,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 3072,\n",
       "  \"layer_norm_eps\": 1e-05,\n",
       "  \"model_type\": \"clip_vision_model\",\n",
       "  \"num_attention_heads\": 12,\n",
       "  \"num_channels\": 3,\n",
       "  \"num_hidden_layers\": 12,\n",
       "  \"patch_size\": 16,\n",
       "  \"projection_dim\": 512,\n",
       "  \"transformers_version\": \"4.36.2\"\n",
       "}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vision-side configuration, read from the CLIP checkpoint and displayed for inspection.\n",
    "vision_config = CLIPVisionConfig.from_pretrained(\n",
    "    pretrained_model_name_or_path='openai/clip-vit-base-patch16',\n",
    ")\n",
    "vision_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0632f57c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text-side configuration, read from the Mistral checkpoint used as the language model.\n",
    "text_config = MistralConfig.from_pretrained(\n",
    "    pretrained_model_name_or_path='mesolitica/malaysian-mistral-7b-32k-instructions-v3.5',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "42c02cd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine the vision and text configurations into a single LLaVA configuration.\n",
    "configuration = LlavaConfig(\n",
    "    vision_config=vision_config,\n",
    "    text_config=text_config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b23b2054",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the LLaVA model from the combined configuration only;\n",
    "# the following cells replace its vision tower and language model.\n",
    "model = LlavaForConditionalGeneration(config=configuration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "74a5ad40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the vision tower with one loaded from the CLIP checkpoint.\n",
    "# from_pretrained is a classmethod, so it is resolved on the tower's own class.\n",
    "model.vision_tower = type(model.vision_tower).from_pretrained(\n",
    "    'openai/clip-vit-base-patch16'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d2cc402e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4e74c33e36b14bd3bf6c6670be716a66",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Replace the language model with one loaded from the Mistral checkpoint.\n",
    "# from_pretrained is a classmethod, so it is resolved on the language model's own class.\n",
    "model.language_model = type(model.language_model).from_pretrained(\n",
    "    'mesolitica/malaysian-mistral-7b-32k-instructions-v3.5'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c5b138c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor, AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ad85d594",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "# Start from the BakLLaVA processor; its tokenizer and image settings are adjusted in a later cell.\n",
    "processor = AutoProcessor.from_pretrained(\n",
    "    pretrained_model_name_or_path='llava-hf/bakLlava-v1-hf',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ca079edc",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the language model's tokenizer, then register the image placeholder and a pad token.\n",
    "# The cell displays the value returned by add_tokens.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\n",
    "    pretrained_model_name_or_path='mesolitica/malaysian-mistral-7b-32k-instructions-v3.5',\n",
    ")\n",
    "tokenizer.add_tokens(new_tokens=['<image>', '<pad>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fc234527",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Processor that ships with the CLIP checkpoint used for the vision tower.\n",
    "clip_processor = AutoProcessor.from_pretrained(\n",
    "    pretrained_model_name_or_path='openai/clip-vit-base-patch16',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6c3ed233",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the language model's tokenizer (with the added tokens) inside the processor.\n",
    "processor.tokenizer = tokenizer\n",
    "\n",
    "# Adopt the complete image preprocessing of the CLIP checkpoint that backs the vision tower.\n",
    "# Copying only crop_size would keep the BakLLaVA processor's own resize target\n",
    "# (presumably larger than 224 px, since its crop size had to be overridden - verify),\n",
    "# so images would be resized for a different tower and then centre-cropped to 224 px,\n",
    "# discarding the border of every image.\n",
    "processor.image_processor = clip_processor.image_processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9c17dc32",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Embedding(32002, 4096)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Grow the embedding table to the tokenizer's current size (it now includes the added tokens).\n",
    "model.resize_token_embeddings(new_num_tokens=len(tokenizer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0acb6117",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample image used for the smoke test below.\n",
    "image_file = 'http://images.cocodataset.org/val2017/000000039769.jpg'\n",
    "\n",
    "# Two identical prompts, so the processor is exercised on a batch of two.\n",
    "prompt = 'USER: <image>\\nWhat are these?\\nASSISTANT:'\n",
    "prompt1 = prompt\n",
    "\n",
    "# Index of the vision-tower hidden state to inspect, as stored in the model config.\n",
    "vision_feature_layer = model.config.vision_feature_layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1fc72f38",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the sample image. The timeout keeps the cell from hanging on a stalled\n",
    "# connection, and raise_for_status() surfaces HTTP errors here instead of letting\n",
    "# PIL fail later on an error page.\n",
    "response = requests.get(image_file, stream=True, timeout=30)\n",
    "response.raise_for_status()\n",
    "raw_image = Image.open(response.raw)\n",
    "\n",
    "# Pass text and images by keyword so the call does not depend on the positional\n",
    "# order of the processor's __call__ arguments.\n",
    "inputs = processor(\n",
    "    text=[prompt, prompt1],\n",
    "    images=[np.array(raw_image), np.array(raw_image)],\n",
    "    return_tensors='pt',\n",
    "    padding=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a8b66eb2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 15])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the tokenised prompt batch.\n",
    "inputs['input_ids'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e98c8e73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 3, 224, 224])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the preprocessed image batch.\n",
    "inputs['pixel_values'].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "a2590b31",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 197, 768])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the vision tower on the image batch, keeping every hidden state,\n",
    "# then pick the layer selected by the model config and show its shape.\n",
    "pixel_values = inputs.pixel_values\n",
    "image_outputs = model.vision_tower(\n",
    "    pixel_values=pixel_values,\n",
    "    output_hidden_states=True,\n",
    ")\n",
    "selected_image_feature = image_outputs.hidden_states[vision_feature_layer]\n",
    "selected_image_feature.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5eb41059",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/huseinzol05/dummy-clip-vit-base-patch16-malaysian-mistral-7b-32k-instructions-v3.5/commit/866802f1eb3eb647ea533493e60361605e4cad29', commit_message='Upload processor', commit_description='', oid='866802f1eb3eb647ea533493e60361605e4cad29', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload the assembled processor to the Hub repository.\n",
    "processor.push_to_hub(\n",
    "    repo_id='huseinzol05/dummy-clip-vit-base-patch16-malaysian-mistral-7b-32k-instructions-v3.5',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d0103312",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Convert the weights to bfloat16 before uploading.\n",
    "# Use .to(dtype) rather than .type(dtype): Module.type() casts every parameter and\n",
    "# buffer, integer buffers included, whereas Module.to(dtype) only casts\n",
    "# floating-point tensors and leaves integer buffers untouched.\n",
    "model = model.to(torch.bfloat16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "81e66237",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "69eb95a2bbfc45479fb88a1452aaa56a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00003-of-00003.safetensors:   0%|          | 0.00/4.86G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "43325eddce484c74b1e9ab0eff88eb79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f3dc0edad412436aacf998ca36f650f6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00001-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bfa265b057804b13b0af8a46adbdb24c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/huseinzol05/dummy-clip-vit-base-patch16-malaysian-mistral-7b-32k-instructions-v3.5/commit/989076d8d3221599b26b93bf58e03ecc3bcdcbef', commit_message='Upload LlavaForConditionalGeneration', commit_description='', oid='989076d8d3221599b26b93bf58e03ecc3bcdcbef', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload the assembled model to the same Hub repository as the processor.\n",
    "model.push_to_hub(\n",
    "    repo_id='huseinzol05/dummy-clip-vit-base-patch16-malaysian-mistral-7b-32k-instructions-v3.5',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f46e9a2f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
